library(ggplot2)
library(e1071)
library(class)
library(tm)
library(tidyr)
library(plyr)
library(jsonlite)
library(dplyr)
library(tidyverse)
library(e1071)
library(mvtnorm)
library(caret)
library(mapproj)
library(plotly)
library(usmap)
library(maps)
library(scales)
library(tidymodels)
library(kknn)
#1. How many breweries are present in each state?
######Import Data#######
Beer = read.csv(file.choose(),header = TRUE)
Breweries = read.csv(file.choose(),header = TRUE)
######Count#######
Count <- Breweries %>% group_by(State) %>% summarise(Brew_ID = n())
Count
## # A tibble: 51 × 2
## State Brew_ID
## <chr> <int>
## 1 " AK" 7
## 2 " AL" 3
## 3 " AR" 2
## 4 " AZ" 11
## 5 " CA" 39
## 6 " CO" 47
## 7 " CT" 8
## 8 " DC" 1
## 9 " DE" 2
## 10 " FL" 15
## # … with 41 more rows
######Plot of Count by State#######
Breweries %>% ggplot(aes(x = State, fill=State)) + geom_bar() + ggtitle("Count of Breweries in each State") + ylab("Number of Breweries") + geom_text(aes(label = ..count..), stat = "count", vjust=-.2, colour = "black") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
#2. Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file. (RMD only, this does not need to be included in the presentation or the deck.)
######Merge Data#######
Beer <- Beer %>% rename(Brew_ID = Brewery_id)
Beer <- Beer %>% rename(Beer_Name = Name)
Breweries <- Breweries %>% rename(Brewery_Name = Name)
Brewery_Beer <- merge(Beer,Breweries, by = "Brew_ID")
######Print Head and Tail#######
head(Brewery_Beer,6)
## Brew_ID Beer_Name Beer_ID ABV IBU Style
## 1 1 Get Together 2692 0.045 50 American IPA
## 2 1 Maggie's Leap 2691 0.049 26 Milk / Sweet Stout
## 3 1 Wall's End 2690 0.048 19 English Brown Ale
## 4 1 Pumpion 2689 0.060 38 Pumpkin Ale
## 5 1 Stronghold 2688 0.060 25 American Porter
## 6 1 Parapet ESB 2687 0.056 47 Extra Special / Strong Bitter (ESB)
## Ounces Brewery_Name City State
## 1 16 NorthGate Brewing Minneapolis MN
## 2 16 NorthGate Brewing Minneapolis MN
## 3 16 NorthGate Brewing Minneapolis MN
## 4 16 NorthGate Brewing Minneapolis MN
## 5 16 NorthGate Brewing Minneapolis MN
## 6 16 NorthGate Brewing Minneapolis MN
tail(Brewery_Beer,6)
## Brew_ID Beer_Name Beer_ID ABV IBU
## 2405 556 Pilsner Ukiah 98 0.055 NA
## 2406 557 Heinnieweisse Weissebier 52 0.049 NA
## 2407 557 Snapperhead IPA 51 0.068 NA
## 2408 557 Moo Thunder Stout 50 0.049 NA
## 2409 557 Porkslap Pale Ale 49 0.043 NA
## 2410 558 Urban Wilderness Pale Ale 30 0.049 NA
## Style Ounces Brewery_Name City
## 2405 German Pilsener 12 Ukiah Brewing Company Ukiah
## 2406 Hefeweizen 12 Butternuts Beer and Ale Garrattsville
## 2407 American IPA 12 Butternuts Beer and Ale Garrattsville
## 2408 Milk / Sweet Stout 12 Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA) 12 Butternuts Beer and Ale Garrattsville
## 2410 English Pale Ale 12 Sleeping Lady Brewing Company Anchorage
## State
## 2405 CA
## 2406 NY
## 2407 NY
## 2408 NY
## 2409 NY
## 2410 AK
#3. Address the missing values in each column.
colSums(is.na(Brewery_Beer))
## Brew_ID Beer_Name Beer_ID ABV IBU Style
## 0 0 0 62 1005 0
## Ounces Brewery_Name City State
## 0 0 0 0
nrow(Brewery_Beer)
## [1] 2410
BB_NoNA = na.omit(Brewery_Beer)
colSums(is.na(BB_NoNA))
## Brew_ID Beer_Name Beer_ID ABV IBU Style
## 0 0 0 0 0 0
## Ounces Brewery_Name City State
## 0 0 0 0
nrow(BB_NoNA)
## [1] 1405
#4. Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.
##########Median ABV and IBU for each State#############
ST_ABV <- aggregate(x= BB_NoNA$ABV, by = list(BB_NoNA$State),FUN = median)
ST_IBU <- aggregate(x= BB_NoNA$IBU, by = list(BB_NoNA$State),FUN = median)
ST_ABV <- ST_ABV %>% rename(Med_ABV = x,State = Group.1)
ST_IBU <- ST_IBU %>% rename(Med_IBU = x,State = Group.1)
######Turn ABV into a %#######
ST_ABV$Med_ABV_Rounded <- round(ST_ABV$Med_ABV ,digit=4)
ST_ABV$Med_ABVPCT <- percent(ST_ABV$Med_ABV_Rounded, accuracy = .01)
######Round INB for Scaling######
ST_IBU$Med_IBU_Rounded <- round(ST_IBU$Med_IBU ,digit=2)
######Plot ABV######
ST_ABV$State <- fct_reorder(ST_ABV$State, ST_ABV$Med_ABV)
ST_ABV %>% ggplot(aes(x = State, y=Med_ABVPCT, fill=State)) + geom_bar(stat="identity") + ggtitle("Med_ABV by State") + ylab("Med_ABV") + geom_text(aes(label = Med_ABVPCT), vjust=0, size= 2, colour = "black")
######Plot IBU######
ST_IBU$State <- fct_reorder(ST_IBU$State, ST_IBU$Med_IBU)
ST_IBU %>% ggplot(aes(x = State, y=Med_IBU_Rounded, fill=State)) + geom_bar(stat="identity") + ggtitle("Med_IBU by State") + ylab("Med_IBU") + geom_text(aes(label = Med_IBU_Rounded), vjust=0, size=2, colour = "black")
######Merge for a cross report######
ST_IBU_ABV <- merge(ST_IBU,ST_ABV, by = "State")
ST_IBU_ABV %>% ggplot(aes(x = Med_ABVPCT, y = Med_IBU_Rounded, color = State)) + geom_jitter() + ggtitle("Med_IBU and Mean_ABV") + xlab("Med_ABV") + ylab("Med_IBU")
######Create Bins for reporting######
ABVPCTFact = cut(ST_IBU_ABV$Med_ABV, breaks = c(.01,.05,.06,.07), labels = c("Low","Medium","High"))
######Plot the cross report######
ST_IBU_ABV %>% mutate(ABVPCTFact = ABVPCTFact) %>% ggplot(aes(x = Med_ABVPCT, y = Med_IBU_Rounded, color = ABVPCTFact)) + geom_jitter() + ggtitle("Med_IBU and Med_ABV") + xlab("Med_ABV") + ylab("Med_IBU")
#5. Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?
Brewery_Beer %>% slice_max(ABV)
## Brew_ID Beer_Name Beer_ID ABV
## 1 52 Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale 2565 0.128
## IBU Style Ounces Brewery_Name City State
## 1 NA Quadrupel (Quad) 19.2 Upslope Brewing Company Boulder CO
Brewery_Beer %>% slice_min(ABV)
## Brew_ID Beer_Name Beer_ID ABV IBU Style Ounces
## 1 523 Scotty K NA 606 0.001 NA Low Alcohol Beer 16
## Brewery_Name City State
## 1 Uncommon Brewers Santa Cruz CA
#6. Comment on the summary statistics and distribution of the ABV variable.
summary(BB_NoNA)
## Brew_ID Beer_Name Beer_ID ABV
## Min. : 1.0 Length:1405 Min. : 1 Min. :0.02700
## 1st Qu.: 95.0 Class :character 1st Qu.: 772 1st Qu.:0.05000
## Median :198.0 Mode :character Median :1439 Median :0.05700
## Mean :224.2 Mean :1415 Mean :0.05991
## 3rd Qu.:351.0 3rd Qu.:2069 3rd Qu.:0.06800
## Max. :547.0 Max. :2692 Max. :0.12500
## IBU Style Ounces Brewery_Name
## Min. : 4.00 Length:1405 Min. : 8.40 Length:1405
## 1st Qu.: 21.00 Class :character 1st Qu.:12.00 Class :character
## Median : 35.00 Mode :character Median :12.00 Mode :character
## Mean : 42.71 Mean :13.51
## 3rd Qu.: 64.00 3rd Qu.:16.00
## Max. :138.00 Max. :32.00
## City State
## Length:1405 Length:1405
## Class :character Class :character
## Mode :character Mode :character
##
##
##
hist(BB_NoNA$ABV)
ABVLevel = cut(BB_NoNA$ABV, breaks = c(.00,.049,.089,.13), labels = c("Low","Medium","High"))
BB_NoNA %>% mutate(ABVLevel= ABVLevel) %>% ggplot(aes(x = ABV, y= IBU, fill=ABVLevel)) + geom_histogram(stat="identity") + ggtitle("Distribution by ABV Level")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
#7. Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot. Make your best judgment of a relationship and EXPLAIN your answer.
BB_NoNA$ABV_Rounded <- round(BB_NoNA$ABV ,digit=3)
BB_NoNA %>% mutate(ABVLevel= ABVLevel) %>% ggplot(aes(x = IBU, y = ABV, color = ABVLevel)) + geom_jitter() + ggtitle("ABV vs. IBU by Levels") + xlab("International Bitterness Unit") + ylab("Alcohol By Volume")
#8. Budweiser would also like to investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA). You decide to use KNN classification to investigate this relationship. Provide statistical evidence one way or the other. You can of course assume your audience is comfortable with percentages … KNN is very easy to understand conceptually. In addition, while you have decided to use KNN to investigate this relationship (KNN is required) you may also feel free to supplement your response to this question with any other methods or techniques you have learned. Creativity and alternative solutions are always encouraged.
## First, we set NA for blank values as a catch-all so that when we retrieve IPA and Ale's we do not run across the error of pulling null values.
Brewery_Beer$Class1 <- NA
for(i in 1:nrow(Brewery_Beer)){
if(grepl("IPA",Brewery_Beer$Style[i]) == TRUE | grepl("India Pale Ale", Brewery_Beer$Style[i]) == TRUE){
Brewery_Beer$Class1[i] <- "IPA"
} else if(grepl("Ale", Brewery_Beer$Style[i]) == TRUE & !grepl("India Pale Ale", Brewery_Beer$Style[i]) == TRUE & !grepl("IPA", Brewery_Beer$Style[i]) == TRUE){
Brewery_Beer$Class1[i] <- "Ale"
} else{
Brewery_Beer$Class1[i] <- NA
}
}
#In this section, we run our KNN test with a randomly selected seed and the original data set with Null values for ABV and IBU still included. We decided to use the original set with null values as that contained a greater set of names than the removed NAs data set.
set.seed(200)
Brewery_BeerKnn <- na.omit(Brewery_Beer)
splitPerc <- 0.80
trainIndices <- sample(1:dim(Brewery_BeerKnn)[1], round(splitPerc * dim(Brewery_BeerKnn)[1]))
train <- Brewery_BeerKnn[trainIndices,]
test <- Brewery_BeerKnn[-trainIndices,]
train$ABV <- scale(train$ABV)
train$IBU <- scale(train$IBU)
test$ABV <- scale(test$ABV)
test$IBU <- scale(test$IBU)
accuracy <- c()
k <- c()
for(i in 1:90){
classifications <- knn(train[, c(4, 5)], test[, c(4, 5)], as.factor(train$Class1), prob = TRUE, k = i)
CM <- confusionMatrix(table(classifications, as.factor(test$Class1)))
accuracy[i] <- CM$overall[1]
k[i] <- i
if(i == 1){
max_k <- i
accuracy_max <- accuracy[i]
}else if(accuracy[i] >= accuracy_max){
max_k <- i
accuracy_max <- accuracy[i]
}
}
plot(k, accuracy, type = "l", xlab = "k")
print(paste0("Max k:", max_k))
## [1] "Max k:36"
print(paste0("Max accuracy:", accuracy[max_k]))
## [1] "Max accuracy:0.894179894179894"
classifications <- knn(train[, c(4, 5)], test[, c(4, 5)], train$Class1, prob = TRUE, k = max_k)
CM <- confusionMatrix(table(classifications, test$Class1))
print(CM)
## Confusion Matrix and Statistics
##
##
## classifications Ale IPA
## Ale 108 10
## IPA 10 61
##
## Accuracy : 0.8942
## 95% CI : (0.8413, 0.9342)
## No Information Rate : 0.6243
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7744
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9153
## Specificity : 0.8592
## Pos Pred Value : 0.9153
## Neg Pred Value : 0.8592
## Prevalence : 0.6243
## Detection Rate : 0.5714
## Detection Prevalence : 0.6243
## Balanced Accuracy : 0.8872
##
## 'Positive' Class : Ale
##
library(plotly)
x_test <- test %>% select("ABV", "IBU")
y_test <- test %>% select("Class1")
yscore <- knn(train[, c(4, 5)], test[, c(4, 5)], train$Class1, prob = TRUE, k = max_k)
yscore <- attributes(yscore)$prob
pdb <- cbind(x_test, y_test)
pdb <- cbind(pdb, yscore)
fig <- plot_ly(data = pdb,x = ~IBU, y = ~ABV, type = 'scatter', mode = 'markers',color = ~yscore, colors = 'RdBu', symbol = ~Class1, split = ~Class1, symbols = c('square-dot','circle-dot'), marker = list(size = 12, line = list(color = 'black', width = 1)))
fig
ggplot(train,aes(x=IBU,y=ABV,colour=Class1))+geom_point(size=0.3)+geom_density2d()
ggplot(train,aes(x=IBU,y=ABV,colour=Class1)) + geom_jitter() +geom_density2d() + ggtitle("Density Plot of ABV vs IBU")
fit.IBU <- aov(IBU~Style, data = Brewery_Beer)
print("ANOVA IBU")
## [1] "ANOVA IBU"
summary(fit.IBU)
## Df Sum Sq Mean Sq F value Pr(>F)
## Style 90 707451 7861 43.34 <2e-16 ***
## Residuals 1314 238303 181
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1005 observations deleted due to missingness
fit.ABV <- aov(ABV~Style, data = Brewery_Beer)
print("ANOVA ABV")
## [1] "ANOVA ABV"
summary(fit.ABV)
## Df Sum Sq Mean Sq F value Pr(>F)
## Style 99 0.2703 2.73e-03 38.35 <2e-16 ***
## Residuals 2248 0.1601 7.12e-05
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 62 observations deleted due to missingness
#There is significant evidence to prove that at least one Style has a different IBU, and one style has a different ABV (p-value < 2e-16).As a result, one can infer that different styles are characterized by different combinations of bitterness and alcohol level.
Brewery_Beer %>% ggplot(aes(x = IBU, y = ABV, color = Style)) + geom_point() + theme(legend.position = "None") + ggtitle("ABV vs IBU by Style")
## Warning: Removed 1005 rows containing missing values (geom_point).